import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
priced_sets = pd.read_csv("./data/priced_sets.csv")
themes = pd.read_csv("./data/themes.csv",sep=';')
inventories = pd.read_csv("./data/inventories.csv",sep=';')
inventory_parts = pd.read_csv("./data/inventory_parts.csv",sep=',')
colors = pd.read_csv("./data/colors.csv",sep=',')
# Adapt the data to the needs of the project
priced_sets["year"] = pd.to_numeric(priced_sets["year"],errors="coerce")
priced_sets['Retail_price'] = priced_sets['Retail_price'].str.replace('€', '')
priced_sets['Retail_price'] = priced_sets['Retail_price'].str.replace(',', '.')
priced_sets['Retail_price'] = priced_sets['Retail_price'].str.replace('promotional', '0')
priced_sets.replace('Unknown',pd.NA)
priced_sets['Retail_price'] = priced_sets['Retail_price'].str.replace('Included with magazine', '0')
priced_sets['Current_price'] = priced_sets['Current_price'].str.replace('€', '')
priced_sets['Current_price'] = priced_sets['Current_price'].str.replace(',', '.')
priced_sets["Retail_price"] = pd.to_numeric(priced_sets["Retail_price"],errors="coerce")
# change name of the columns to make them more readable
themes = themes.rename(columns={"name":"theme_name"})
inventories = inventories.rename(columns={"id":"inventory_id"})
colors = colors.rename(columns={"id":"color_id"})
colors = colors.rename(columns={"name":"color_name"})
#Changes retail prices to current value
change_factor = {
1947: 39.464, 1948: 37.272, 1949: 36.733, 1950: 37.234, 1951: 33.937, 1952: 32.554, 1953: 31.934,
1954: 31.097, 1955: 30.250, 1956: 28.815, 1957: 28.269, 1958: 26.978, 1959: 27.089, 1960: 26.389,
1961: 25.640, 1962: 24.395, 1963: 22.690, 1964: 21.420, 1965: 20.528, 1966: 20.125, 1967: 19.731,
1968: 19.482, 1969: 18.950, 1970: 18.033, 1971: 17.175, 1972: 16.261, 1973: 14.733, 1974: 12.335,
1975: 10.527, 1976: 9.035, 1977: 7.650, 1978: 6.803, 1979: 5.878, 1980: 4.852, 1981: 4.088, 1982: 3.514,
1983: 3.056, 1984: 2.763, 1985: 2.544, 1986: 2.398, 1987: 2.292, 1988: 2.184, 1989: 2.049, 1990: 1.931,
1991: 1.815, 1992: 1.721, 1993: 1.652, 1994: 1.590, 1995: 1.509, 1996: 1.452, 1997: 1.427, 1998: 1.402,
1999: 1.380, 2000: 1.346, 2001: 1.311, 2002: 1.280, 2003: 1.249, 2004: 1.225, 2005: 1.204, 2006: 1.181,
2007: 1.161, 2008: 1.124, 2009: 1.116, 2010: 1.099, 2011: 1.070, 2012: 1.039, 2013: 1.027,
2014: 1.025, 2015: 1.026, 2016: 1.027, 2017: 1.016, 2018: 1.005
}
priced_sets = priced_sets.assign(Current_Retail_Price = lambda x: x['Retail_price']*x['year'].map(change_factor))
# create plots to show the evolution of the prices
box_retail_price = px.box(priced_sets, x="year", y="Retail_price",labels={'year':'Year' ,'Retail_price':'Retail Price'},range_y=[0,700])
box_current_retail_price = px.box(priced_sets,x="year",y="Current_Retail_Price",labels={'year':'Year' ,'Current_Retail_Price':'Normalizied Retail Price'},range_y=[0,700])
# group the data by group of years and calculate the median of the prices
import warnings
warnings.filterwarnings("ignore")
median_price_per_year = priced_sets.groupby('year').median()
median_price_per_year = median_price_per_year.reset_index()
median_price_per_year = median_price_per_year.dropna()
median_price_per_year = median_price_per_year.sort_values(by=['year'],ascending=True)
grouped = median_price_per_year[median_price_per_year['year'] >= 1947]
grouped = grouped[median_price_per_year['year'] <= 1980]
not_grouped = median_price_per_year[median_price_per_year['year'] > 1980]
#Labeled as 1975 and not 1947-1980 for the sake of the graph
grouped['year_group'] = pd.cut(median_price_per_year['year'], bins=[1947, 1980], labels=['1975'])
grouped = grouped.dropna()
grouped = grouped.sort_values(by=['year'],ascending=True)
grouped = grouped.reset_index()
grouped = grouped.drop(columns=['index'])
grouped = grouped.drop(columns=['year'])
grouped = grouped.drop(columns=['id'])
grouped = grouped.drop(columns=['theme_id'])
grouped = grouped.rename(columns={'Current_Retail_Price':'Median Retail Price'})
grouped = grouped.rename(columns={'year_group':'years'})
grouped = grouped.groupby('years', as_index=False).median()
last_year = 1980
for i in range(len(not_grouped)):
not_grouped = median_price_per_year[median_price_per_year['year'] > 1980]
last_year = last_year + 1
not_grouped['year_group'] = pd.cut(median_price_per_year['year'], bins=[last_year-1,last_year], labels=[last_year])
not_grouped = not_grouped.dropna()
not_grouped = not_grouped.sort_values(by=['year'],ascending=True)
not_grouped = not_grouped.reset_index()
not_grouped = not_grouped.drop(columns=['index'])
not_grouped = not_grouped.drop(columns=['year'])
not_grouped = not_grouped.drop(columns=['id'])
not_grouped = not_grouped.drop(columns=['theme_id'])
not_grouped = not_grouped.rename(columns={'Current_Retail_Price':'Median Retail Price'})
not_grouped = not_grouped.rename(columns={'year_group':'years'})
not_grouped = not_grouped.groupby('years', as_index=False).median()
grouped = grouped.append(not_grouped)
grouped["years"] = pd.to_numeric(grouped["years"],errors="coerce")
grouped = grouped.sort_values(by=['years'],ascending=True)
#create a scatter plot to show the median price per year
median_price_per_year_line = px.line(grouped, x="years", y="Median Retail Price",labels={'years':'Year'},range_x=[1975,2017],range_y=[0,100],title='Median Retail Price Per Year')
median_price_per_year_line.update_layout(
xaxis = dict(
tickmode = 'array',
tickvals = [1975,1981,1985,1990,1995,2000,2005,2010,2015,2017],
ticktext = ['1947-1980',1981,1985,1990,1995,2000,2005,2010,2015,2017]
),
yaxis = dict(
tickmode = 'array',
tickvals = [20,50,75,100]
)
)
print()
#display the plots
box_retail_price.show()
box_current_retail_price.show()
median_price_per_year_line.show()
# show the 15 most produced themes in number of sets
themes_sets = pd.merge(left=priced_sets, right=themes, how='left', left_on='theme_id', right_on='id')
top = themes_sets['theme_id'].value_counts().head(15)
top_15_themes = pd.DataFrame({'id':top.index, 'count':top.values})
top_15_themes = pd.merge(top_15_themes, themes, on='id')
top_15_themes = top_15_themes.sort_values(by=['count'],ascending=False)
top_15_themes = top_15_themes.reset_index()
top_15_themes = top_15_themes.drop(columns=['parent_id','index'])
top_15_themes.sort_values('count',ascending=True,inplace=True)
top_15_themes.drop_duplicates(subset ="theme_name", keep = 'first', inplace = True)
p = px.bar(top_15_themes, x=top_15_themes['count'], y=top_15_themes['theme_name'], title='Top 15 More Produced Themes',labels={'x':'Number of sets','y':'Theme'})
p.update_layout(
xaxis={'side': 'top'},
)
p.show()
# show the top 10 sets with the highest number of parts
set_num_parts_sorted = priced_sets.sort_values(by=['num_parts'],ascending=False).drop_duplicates(subset=['name']).head(10)
set_num_parts_sorted = set_num_parts_sorted.sort_values(by=['num_parts'],ascending=True)
number_of_components = px.bar(set_num_parts_sorted,x="num_parts", y="name",
range_x=[0,6000],
labels={"num_parts": "Number of parts","name": "Set name"})
number_of_components.update_layout(
xaxis={'side': 'top'},
)
number_of_components.show()
#show the regression line of the mean number of parts per year
b = 0
w = 4
mean_num_parts_per_year = priced_sets.groupby('year', as_index=False).mean()
mean_num_parts_per_year = mean_num_parts_per_year.reset_index()
x = mean_num_parts_per_year.index
y_hat = b + w*x
x_line = np.array([np.min(x), np.max(x)])
y_line = b + w*x_line
data = [
go.Scatter(x=x, y= mean_num_parts_per_year['num_parts'], mode='markers', name='Data'),
go.Scatter(x=x, y=y_hat, mode="markers", name="estimate"),
go.Scatter(x=x_line, y=y_line, mode='lines', name='Regression Line')
]
for i in range(len(x)):
data.append(go.Scatter(x=[x[i], x[i]], y=[mean_num_parts_per_year['num_parts'][i], y_hat[i]], mode="lines",
showlegend=False, line=dict(color="gray", width=0.5)),)
layout = go.Layout(title='Regression line of Average number of parts per sets per year (y=4x)', xaxis={'title':'Year passed from 1950'}, yaxis={'title':'Average number of parts'})
fig = go.Figure(data=data, layout=layout)
fig.show()
Percentage of Gain/Loss formula
$$P = {S - R \over R}*100$$P = Percentage of Gain/Loss S = Sell price R = Retail price
# calculate the profit percentage of each set
no_promo_sets = priced_sets.loc[priced_sets['Current_price'] != 0]
no_promo_sets = no_promo_sets.loc[no_promo_sets['Current_Retail_Price'] != 0]
no_promo_sets['Current_price'] = pd.to_numeric(no_promo_sets['Current_price'],errors='coerce')
no_promo_sets['Current_Retail_Price'] = pd.to_numeric(no_promo_sets['Current_Retail_Price'],errors='coerce')
no_promo_sets = no_promo_sets.loc[no_promo_sets['Current_price'] > 0]
no_promo_sets = no_promo_sets.loc[no_promo_sets['Current_Retail_Price'] > 0]
no_promo_sets['Percentage'] = no_promo_sets.apply(lambda x: ((x['Current_price'] - x['Current_Retail_Price'])/x['Current_Retail_Price'] * 100),axis=1)
no_promo_sets['Percentage'] = pd.to_numeric(no_promo_sets['Percentage'],errors='coerce')
# show the graph of the percentage of gain per number of set
fig = go.Figure()
fig.add_trace(
go.Histogram(
x=no_promo_sets['Percentage'],
name='Gain',
xbins=dict(
start=0,
end=500,
size=5
),
marker_color='green'
)
)
fig.add_trace(
go.Histogram(
x=no_promo_sets['Percentage'],
name='Loss',
xbins=dict(
start=-100,
end=0,
size=5
),
marker_color='red'
)
)
fig.add_shape(
dict(
type="line",
x0=0,
y0=0,
x1=0,
y1=200,
line=dict(
color="Red",
width=1
),
))
fig.update_layout(
title_text='Percentage Gain or Loss since release',
xaxis_title_text='Percentage Gain or Loss',
yaxis_title_text='Number of Sets',
bargap=0.2,
bargroupgap=0.1,
)
fig.show()
# show the top 10 sets with the highest profit percentage
top_10 = no_promo_sets.sort_values(by=['Percentage'],ascending=False).head(10)
top_10 = top_10.sort_values(by=['Percentage'],ascending=True)
top_10 = top_10.reset_index()
top_10 = top_10.drop(columns=['index'])
top_10 = top_10.drop(columns=['id'])
top_10 = top_10.drop(columns=['theme_id'])
top_10 = top_10.drop(columns=['set_num'])
top_10 = top_10.drop(columns=['num_parts'])
top_10 = top_10.rename(columns={'name':'Set Name'})
top_10 = top_10.drop_duplicates(subset=['Set Name'])
Original mathematical expression $$ I = {C * r * t \over 100} $$ Mathematical expression used $$ r = {I * 100 \over C * t }$$ C = Original Normalized Retail price I = Gain/loss in Euros ( Current Price - Retail Price) t = number of years, from production year to 2022 r = annual percentage profit
# show the top 10 sets with the highest profit percentage per year
no_promo_sets['Annual_Percentage'] = no_promo_sets.apply(lambda x: ((x['Current_price'] - x['Current_Retail_Price'])*100/(x['Current_Retail_Price']*(2022-x['year']))),axis=1)
no_promo_sets['Annual_Percentage'] = pd.to_numeric(no_promo_sets['Annual_Percentage'],errors='coerce')
top_10_annual_percentage = no_promo_sets.sort_values(by=['Annual_Percentage'],ascending=False).head(10)
top_10_annual_percentage = top_10_annual_percentage.sort_values(by=['Annual_Percentage'],ascending=True)
top_10_annual_percentage = top_10_annual_percentage.reset_index()
top_10_annual_percentage = top_10_annual_percentage.drop(columns=['index'])
top_10_annual_percentage = top_10_annual_percentage.drop(columns=['id'])
top_10_annual_percentage = top_10_annual_percentage.drop(columns=['set_num'])
top_10_annual_percentage = top_10_annual_percentage.drop(columns=['num_parts'])
top_10_annual_percentage = top_10_annual_percentage.rename(columns={'name':'Set Name'})
top_10_annual_percentage = top_10_annual_percentage.drop_duplicates(subset=['Set Name'])
# calculate and show the top 10 themes with the highest profit percentage per year and from release
theme_more_profitable = no_promo_sets.groupby('theme_id').median()
theme_more_profitable.drop(columns=['id','year','Current_price','Current_Retail_Price','num_parts','Retail_price'],inplace=True)
theme_more_profitable_name = theme_more_profitable.merge(themes, left_on='theme_id', right_on='id')
theme_more_profitable_name.drop(columns=['parent_id'],inplace=True)
theme_more_profitable_name = theme_more_profitable_name.reset_index()
theme_more_total_profitable = theme_more_profitable_name.sort_values(by=['Percentage'],ascending=False).head(10)
theme_more_annual_profitable = theme_more_profitable_name.sort_values(by=['Annual_Percentage'],ascending=False).head(10)
theme_more_total_profitable = theme_more_total_profitable.sort_values(by=['Percentage'],ascending=True).head(10)
theme_more_annual_profitable = theme_more_annual_profitable.sort_values(by=['Annual_Percentage'],ascending=True).head(10)
fig = px.bar(theme_more_total_profitable, x='Percentage', y='theme_name',title='Top 10 Themes with the most profit since release',orientation='h')
fig.update_layout(
xaxis={'side': 'top'},
title=dict(
x=0.5,
y=0.95,
xanchor='center',
yanchor='top',
)
)
fig.show()
fig = px.bar(theme_more_annual_profitable, x='Annual_Percentage', y='theme_name',title='Top 10 Themes with the most profit per year since release',orientation='h')
fig.update_layout(
xaxis={'side': 'top'},
title=dict(
x=0.5,
y=0.95,
xanchor='center',
yanchor='top',
)
)
fig.show()
# show the top 10 sets with the highest profit percentage per year and from release
top_10_annual_percentage = px.bar(top_10_annual_percentage, y="Set Name", x="Annual_Percentage",labels={'Annual_Percentage':'Annual Percentage'},title='Top 10 Sets with the Highest Annual Percentage of Profit')
top_10_annual_percentage.show()
top_10 = px.bar(top_10, y="Set Name", x="Percentage",labels={'Percentage':'Percentage Gain'},title='Top 10 Sets with the Highest Percentage of Gain since release')
top_10.show()
# search the top 6 produced themes
top_6_themes = themes_sets['theme_id'].value_counts().head(6)
themes_to_consider = themes_sets
themes_to_consider = themes_to_consider[themes_to_consider['theme_id'].isin(top_6_themes.index)]
themes_to_consider = themes_to_consider['theme_name'].unique()
top_10_colors_correct = inventory_parts.groupby(['color_id']).size().reset_index(name='counts')
top_10_colors_correct = top_10_colors_correct.sort_values(by=['counts'],ascending=False).head(10)
colors_set = pd.read_csv('./data/colors.csv')
top_10_colors_for_graph = colors_set.loc[colors_set['id'].isin(top_10_colors_correct['color_id'].values)]
top_10_colors_for_graph['counts'] = top_10_colors_for_graph.apply(lambda _: '', axis=1)
for i in range(len(top_10_colors_for_graph)):
top_10_colors_for_graph['counts'].iloc[i] = top_10_colors_correct['counts'].iloc[i]
top_10_colors_graph = px.bar(top_10_colors_for_graph, x="name", y="counts",labels={'counts':'Number of Parts'},title='Top 10 Colors with the Highest Number of Parts')
number_of_components.update_layout(
xaxis={'side': 'top'},
)
top_10_colors_graph.show()
top_5_colors = top_10_colors_for_graph['name'].head(5).to_numpy()
# calculate the most used colors in the top 6 themes
themes_merged_set = priced_sets.merge(themes, left_on='theme_id', right_on='id')
themes_merged_set = themes_merged_set.drop(columns=['id_x','id_y','parent_id'])
themes_merged_set = themes_merged_set.merge(inventories, left_on='set_num', right_on='set_num')
themes_merged_set = themes_merged_set.merge(inventory_parts, on='inventory_id')
colors_set.rename(columns={'id':'color_id'},inplace=True)
colors_set.rename(columns={'name':'color_name'},inplace=True)
themes_merged_set = themes_merged_set.merge(colors_set, on='color_id')
set_for_friends = themes_merged_set
themes_merged_set = themes_merged_set.loc[themes_merged_set['theme_name'].isin(themes_to_consider)]
for i in range(len(themes_merged_set)):
if themes_merged_set['color_name'].iloc[i] not in top_5_colors:
themes_merged_set['color_name'].iloc[i] = 'other'
themes_merged_set_grouped = themes_merged_set.groupby(['theme_name','color_name']).size().reset_index(name='counts')
themes_merged_set_grouped = themes_merged_set_grouped.reset_index()
# show the most used colors in the top 6 themes
from plotly.subplots import make_subplots
rows = 2
cols = 3
top_5_colors.sort()
if 'other' not in top_5_colors:
top_5_colors = np.append(top_5_colors,'other')
fig = make_subplots(rows=rows, cols=cols, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}],
[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]],
horizontal_spacing=0.1, vertical_spacing=0)
i,j = 1,1
for theme in themes_to_consider:
theme_df = themes_merged_set_grouped.loc[themes_merged_set_grouped['theme_name'] == theme]
for color in top_5_colors:
if color not in theme_df['color_name'].values:
theme_df = theme_df.append({'theme_name':theme,'color_name':color,'counts':0},ignore_index=True)
val = theme_df.sort_values(by=['color_name'],ascending=True)
fig.add_trace(go.Pie(labels=top_5_colors, values=val['counts'].values, name=theme),i,j)
if j == cols:
j = 1
i += 1
else:
j += 1
c = []
for color in top_5_colors:
if(color == 'other'):
c.append('#AAAAAA33')
elif(color == 'Light Gray'):
c.append('#d3d3d3')
else:
c.append(color)
fig.update_traces(marker=dict(
colors=c,
line = dict(color='#000000', width=1)
)
)
i,j = 0,0
position_x = [0.11,0.50,0.91]
position_y = [0.58,0.05]
annotations = []
for theme in themes_to_consider:
annotations.append(dict(text=theme, x=position_x[j], y=position_y[i], font_size=20, showarrow=False))
if j == cols-1:
j = 0
i += 1
else:
j += 1
fig.update_layout(
title_text="Top 5 Colors Distribution in Top 6 Themes",
title_x=0.5,
title_y=0.95,
# Add annotations in the center of the donut pies.
annotations=annotations,
legend=dict(
traceorder="normal",
font=dict(
family="sans-serif",
size=15,
color="black"
),
bordercolor="Black",
borderwidth=2
),
height=1000,
)
fig.show()
# show the most used colors in the friends theme
lego_friends = set_for_friends.loc[set_for_friends['theme_name'] == 'Friends'].groupby(['color_id']).size().reset_index(name='counts')
lego_friends = lego_friends.sort_values(by=['counts'],ascending=False)
lego_friends = lego_friends.head(10)
lego_friends = lego_friends.reset_index()
lego_friends = lego_friends.merge(colors_set, on='color_id')
lego_friends = lego_friends.rename(columns={'name':'color_name'})
lego_friends = lego_friends[['color_name','counts']]
lego_friends = lego_friends.sort_values(by=['counts'],ascending=False)
lego_friends = lego_friends.reset_index()
lego_friends = lego_friends.drop(columns=['index'])
fig = px.bar(lego_friends, x='color_name', y='counts')
fig.update_layout(
title_text="Top 10 Colors Distribution in Friends Theme",
title_x=0.5,
title_y=0.95,
legend=dict(
traceorder="normal",
font=dict(
family="sans-serif",
size=15,
color="black"
),
bordercolor="Black",
borderwidth=2
),
)
fig.show()